import requests
import json
import random
import os
import time

os.chdir('C:/Users/dell/Desktop')

base_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_1705586121&since_id='

# Rotate between a few user-agent strings to look less like a bot
head = [
    "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
    "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
    "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
    "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
    "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
]
header = {'user-agent': random.choice(head)}

with open('user_id.txt', 'w') as f:
    for page in range(2, 251):  # note: user info only starts from page 2
        try:
            url = base_url + str(page)
            r = requests.get(url, headers=header)
            data = json.loads(r.text)
            all_user = data['data']['cards'][0]['card_group']
            for user in all_user:
                # desc2 holds the fan count after a colon; keep users with >= 20 fans
                fans = int(user.get('desc2').split(':')[1])
                if fans >= 20:
                    f.write(str(user.get('user')['id']) + '\n')
            print('Page {} of user ids crawled'.format(page))
            time.sleep(random.randint(1, 3))
        except Exception:
            print('Failed to fetch data')
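Before launching the loop over 249 pages, it can help to inspect a single page of the getIndex response by hand. A minimal sketch (same containerid as above; the data → cards → card_group path is the one the script relies on):

import json
import requests

# Fetch one page of the fan list and print each entry's id and desc2,
# to confirm the JSON layout before running the full crawl.
url = ('https://m.weibo.cn/api/container/getIndex'
       '?containerid=231051_-_fans_-_1705586121&since_id=2')
r = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
data = json.loads(r.text)
for user in data['data']['cards'][0]['card_group']:
    print(user.get('user', {}).get('id'), user.get('desc2'))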
Some of the crawled user IDs (the contents of user_id.txt):
3. Open the old-version Weibo site (weibo.cn), log in, and grab the cookie from your browser, as shown in the figure on the right:
Once obtained, replace the '你自己的cookie' placeholder in the code below with your own cookie.
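Before running the full script below, a quick way to confirm the cookie is accepted is to request a single profile page and check that the XPath used later actually finds something. A small sketch (the test uid 1705586121 is the one from the containerid in step 1; any id from user_id.txt works too):

import requests
from lxml import etree

header = {
    'user-agent': 'Mozilla/5.0',
    'cookie': '你自己的cookie',  # paste your own cookie here
}
r = requests.get('https://weibo.cn/u/1705586121', headers=header)
spans = etree.HTML(r.text.encode('utf-8')).xpath('//span[@class="ctt"]/text()')
# A valid cookie yields the username and profile line; an empty list usually
# means the cookie is missing or expired.
print(spans[:2])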
import numpy as np
import pandas as pd
import requests
from lxml import etree
import random
import time

header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
    'cookie': '你自己的cookie',  # replace with your own cookie
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
}
url_new = 'https://weibo.cn/u/'
data = []
count = 0

def get_id(path):
    # Read the ids collected in the previous step, one per line
    with open(path, 'r') as f:
        user_list = f.readlines()
    user_id = np.char.rstrip(user_list, '\n')
    return user_id

def gethtml(url, header):
    r = requests.get(url, headers=header)
    if r.status_code == 200:
        return r.text
    else:
        print('Network connection error')

for user_id in get_id('user_id.txt'):
    try:
        url = url_new + user_id
        r_text = gethtml(url, header)
        html = etree.HTML(r_text.encode('utf-8'))
        # The first two 'ctt' spans hold the username and the profile line
        user_name = html.xpath('//span[@class="ctt"]/text()')[0]
        inf = html.xpath('//span[@class="ctt"]/text()')[1]
        # Counts are rendered as e.g. '微博[123]'; strip the label and brackets
        weibo_number = html.xpath('//div[@class="tip2"]/span[@class="tc"]/text()')[0].replace('微博', '').strip('[]')
        focus_number = html.xpath('//div[@class="tip2"]/a[1]/text()')[0].replace('关注', '').strip('[]')
        fan_number = html.xpath('//div[@class="tip2"]/a[2]/text()')[0].replace('粉丝', '').strip('[]')
        data.append([user_name, inf, weibo_number, focus_number, fan_number])
        count += 1
        print('User {} info written'.format(count))
        time.sleep(random.randint(1, 2))
    except Exception:
        print('Incomplete user info, skipped')

df = pd.DataFrame(data, columns=['user_name', 'inf', 'weibo_num', 'focus_num', 'fans_num'])
df.to_csv('weibo_user.csv', index=False, encoding='gb18030')
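To verify the export, read the CSV back with the same encoding (gb18030 is used above because the profile fields contain Chinese text):

import pandas as pd

# Reload the exported file to confirm the rows and encoding survived.
df = pd.read_csv('weibo_user.csv', encoding='gb18030')
print(len(df), 'users saved')
print(df.head())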